library(tidyverse)
library(dplyr)
library(maps)
library(spBayes)
library(viridis)
########
# Datasets description
# PA timestamp is recorded in GMT
# EPA timestamp is recorded in both GMT and EST
# However, the time difference between GMT and EST is always the same (5 hr)
# Which means that the EPA timestamp may not account for daylight saving time
# The latest EPA update is 08/31/2020
# Project root; all data paths below are relative to it.
setwd("/Users/hongjianyang/Research/PA/")
pa2018 <- read.csv('Data/NC_PA_FRM_PM/PA_2018_hourly_PM_NC.csv')
# Keep coordinates, timestamp and the corrected PM2.5 reading; average
# duplicate readings taken at the same site and hour.
col <- c('Lon', 'Lat', 'Timestamp', 'PM25_corrected')
pa <- pa2018 %>%
  select(all_of(col)) %>%  # all_of(): explicit external-vector selection
  rename(PM25 = PM25_corrected) %>%
  group_by(Lon, Lat, Timestamp) %>%
  summarise(PM25 = mean(PM25), .groups = "drop")
pa <- pa[order(pa$Lon, pa$Lat, pa$Timestamp), ]
# Remove NAs
pa <- pa[complete.cases(pa), ]
# Keep a single day of data: 2018-12-21 00:00:00 through 23:59:59
pa$Timestamp <- as.POSIXct(pa$Timestamp)
startTime <- as.POSIXct('2018-12-21 00:00:00')
endTime <- as.POSIXct('2018-12-21 23:59:59')
pa <- subset(pa, Timestamp >= startTime & Timestamp <= endTime)
# Window length in seconds from the first to the last full hour; used below
# to derive the number of hourly timestamps.
start <- as.numeric(startTime)
end <- as.numeric(as.POSIXct('2018-12-21 23:00:00'))
diff <- end - start  # NOTE: shadows base::diff() for the rest of the script
# Prediction locations
# Longitude first, then latitude
s01 <- seq(-90, -65, 0.1) # Lon
s02 <- seq(30, 38, 0.1)   # Lat
s0 <- as.matrix(expand.grid(s01, s02))
# Keep only grid points that fall inside North Carolina
innc <- map.where(map('state', 'north carolina', fill = TRUE), s0[, 1], s0[, 2])

s0 <- s0[!is.na(innc), ]
X0 <- cbind(1, s0)
# Standardize the prediction coordinates.
# NOTE(review): the grid is standardized with its own mean/sd while the
# station covariates in the loops below use the stations' mean/sd --
# confirm this mismatch is intended.
X0[, 2] <- (X0[, 2] - mean(X0[, 2])) / sd(X0[, 2])
X0[, 3] <- (X0[, 3] - mean(X0[, 3])) / sd(X0[, 3])
lon <- X0[, 2]
lat <- X0[, 3]
lon2 <- lon * lon
lat2 <- lat * lat
lonlat <- lon * lat
# Quadratic trend-surface design matrix for prediction.
# BUG FIX: the column order must match the training design matrix
# X <- cbind(1, lon, lat, lon2, lat2, lonlat) built inside the kriging loops;
# the original cbind(1, lat, lon, lat2, lon2, lonlat) paired the fitted
# regression coefficients with the wrong covariates at prediction time.
X0 <- cbind(1, lon, lat, lon2, lat2, lonlat)
## Bayesian Kriging set up
library(spBayes)  # already attached above; kept so this chunk runs standalone
library(viridis)
n.samples <- 25000  # total MCMC iterations per hourly fit
burn <- 5000        # burn-in discarded before prediction
n_timestamp <- diff / 3600 + 1  # number of hourly timestamps in the window
# Mean posterior predictive SD near RTP, one entry per hour.
# FIX: numeric() instead of vector(), which allocates a logical vector.
one_var <- numeric(n_timestamp)
# For each hour of the day: fit a Bayesian spatial model (spLM) to that hour's
# PurpleAir readings, predict on the NC grid, and record / plot the posterior
# predictive standard deviation.
for (i in 0:(n_timestamp - 1)) {
  current <- as.POSIXct(3600 * i, origin = startTime)
  subdf <- subset(pa, Timestamp == current)
  S <- data.matrix(subdf[, 1:2])  # station coordinates (Lon, Lat)
  maxd <- max(dist(S))            # maximum inter-station distance
  # Construct the quadratic trend-surface covariates from standardized coords
  X <- data.frame(cbind(1, S))
  X[, 2] <- (X[, 2] - mean(X[, 2])) / sd(X[, 2])
  X[, 3] <- (X[, 3] - mean(X[, 3])) / sd(X[, 3])
  lon <- X[, 2]
  lat <- X[, 3]
  lon2 <- lon * lon
  lat2 <- lat * lat
  lonlat <- lon * lat
  # NOTE(review): this column order must agree with X0 built above -- verify.
  X <- cbind(1, lon, lat, lon2, lat2, lonlat)
  # Kriging: weakly informative priors; phi range tied to the data's extent
  Y <- subdf$PM25
  starting <- list("phi" = 1 / (0.05 * maxd),
                   "sigma.sq" = 0.5 * var(Y), "tau.sq" = 0.5 * var(Y))
  tuning <- list("phi" = 1, "sigma.sq" = 0.05 * var(Y),
                 "tau.sq" = 0.05 * var(Y))
  amcmc <- list("n.batch" = n.samples / 100, "batch.length" = 100,
                "accept.rate" = 0.43)
  priors <- list("beta.Norm" = list(rep(0, 6), 100 * diag(6)),
                 "phi.Unif" = c(1 / (2 * maxd), 1 / (0.01 * maxd)),
                 "sigma.sq.IG" = c(2, 1),
                 "tau.sq.IG" = c(2, 1))
  fit_spbayes <- spLM(Y ~ X - 1, coords = S,
                      starting = starting, tuning = tuning,
                      priors = priors, cov.model = "exponential",
                      amcmc = amcmc, n.samples = n.samples, verbose = FALSE)
  pred <- spPredict(fit_spbayes, pred.coords = s0, pred.covars = X0,
                    start = burn, thin = 10, verbose = FALSE)
  pred <- pred$p.y.predictive.samples
  Yhat <- apply(pred, 1, mean)  # posterior predictive mean (kept, unused here)
  Ysd <- apply(pred, 1, sd)     # posterior predictive standard deviation
  df <- data.frame(long = s0[, 1], lat = s0[, 2], Y = Ysd)
  # BUG FIX: originally this print ran before df was created, so the first
  # iteration showed dim(base::df) == NULL; report progress after creation.
  print(dim(df))
  # Samples near RTP area
  # Longitude range: -78.89, -78.52; Latitude range: 35.71, 35.92
  rtp_sample <- subset(df, long <= -78.52 & long >= -78.89 &
                         lat >= 35.71 & lat <= 35.92)
  rtp_std <- mean(rtp_sample$Y)
  one_var[i + 1] <- rtp_std
  # Plot standard deviation surface for this hour
  pred_var <- ggplot(df, aes(long, lat)) +
    geom_polygon(aes(x = long, y = lat)) +
    geom_raster(aes(fill = Y)) +
    scale_fill_gradientn(colours = viridis(10)) +
    xlab(current) + ylab("") +
    labs(title = "Posterior predictive standard deviation") +
    coord_fixed()
  plot(pred_var)
}
## NULL

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

# Overlay the last processed hour's station locations on a county-level map
# of North Carolina.
map <- map_data("county")
nc <- filter(map, region == "north carolina")
R <- ggplot(data = nc) +
  geom_polygon(mapping = aes(x = long, y = lat, group = group)) +
  geom_point(data = subdf, mapping = aes(x = Lon, y = Lat),
             colour = 'purple', size = 0.5) +
  coord_fixed()
print(R)

###########################################################################
###########################################################################
###########################################################################
# 2020 Data
# Explore variance between 07/01/2020 - 07/07/2020
# 29 epa stations, 89 purple air stations (07/01)
###########################################################################
###########################################################################
###########################################################################
pa2020 <- read.csv('Data/NC_PA_FRM_PM/PA_2020_hourly_PM_NC.csv')
epa2020 <- read.csv('Data/NC_PA_FRM_PM/FRM_2020_hourly_PM_NC.csv')
# Subset PA data: average duplicate readings per site and hour
col <- c('Lon', 'Lat', 'Timestamp', 'PM25_corrected')
pa <- pa2020 %>%
  select(all_of(col)) %>%  # all_of(): explicit external-vector selection
  rename(PM25 = PM25_corrected) %>%
  group_by(Lon, Lat, Timestamp) %>%
  summarise(PM25 = mean(PM25), .groups = "drop")
pa <- pa[order(pa$Lon, pa$Lat, pa$Timestamp), ]
# Remove NAs
pa <- pa[complete.cases(pa), ]
# Process EPA data: build a full timestamp from the GMT date and hour columns
epa2020$Timestamp <- paste(epa2020$Date_GMT, epa2020$Hour_GMT)
epa2020$Timestamp <- paste0(epa2020$Timestamp, ':00')
col <- c('Lon', 'Lat', 'Timestamp', 'PM25')
epa <- epa2020 %>%
  select(all_of(col))
epa <- epa[order(epa$Lon, epa$Lat, epa$Timestamp), ]
# Get 07/01/2020 - 07/07/2020 data
pa$Timestamp <- as.POSIXct(pa$Timestamp)
# NOTE(review): assumes Date_GMT is formatted as %Y-%m-%d -- confirm against
# the raw FRM file; a format mismatch would make every timestamp NA here.
epa$Timestamp <- as.POSIXct(epa$Timestamp, format = '%Y-%m-%d %H:%M:%OS')
startTime <- as.POSIXct('2020-07-01 00:00:00')
endTime <- as.POSIXct('2020-07-07 23:59:59')
pa <- subset(pa, Timestamp <= endTime & Timestamp >= startTime)
epa <- subset(epa, Timestamp <= endTime & Timestamp >= startTime)
# Window length in seconds; used to derive the number of hourly timestamps
start <- as.numeric(startTime)
end <- as.numeric(as.POSIXct('2020-07-07 23:00:00'))
diff <- end - start
# Source indicator: 1 = EPA/FRM station, 0 = PurpleAir sensor
epa$indicator <- 1
pa$indicator <- 0
df_combine <- rbind(epa, pa)
# Append the indicator covariate to the prediction design matrix; predictions
# are made at indicator = 0, i.e. on the PurpleAir level of the model.
X0 <- cbind(X0, 0)
n_timestamp <- diff / 3600 + 1
# FIX: numeric() instead of vector(), which allocates a logical vector
one_var <- numeric(n_timestamp)
# Model: Y = b0 + b1*Lon + b2*Lat + b3*Lon^2 + b4*Lat^2 + b5*Lon*Lat
#            + b6*I(epa)
# Same hourly kriging as the 2018 loop, now on combined EPA + PurpleAir data
# with a source-indicator covariate (7 regression coefficients).
for (i in 0:(n_timestamp - 1)) {
  current <- as.POSIXct(3600 * i, origin = startTime)
  subdf <- subset(df_combine, Timestamp == current)
  S <- data.matrix(subdf[, 1:2])  # station coordinates (Lon, Lat)
  maxd <- max(dist(S))            # maximum inter-station distance
  # Construct the quadratic trend-surface covariates from standardized coords
  X <- data.frame(cbind(1, S))
  X[, 2] <- (X[, 2] - mean(X[, 2])) / sd(X[, 2])
  X[, 3] <- (X[, 3] - mean(X[, 3])) / sd(X[, 3])
  lon <- X[, 2]
  lat <- X[, 3]
  lon2 <- lon * lon
  lat2 <- lat * lat
  lonlat <- lon * lat
  # NOTE(review): this column order must agree with X0 built above -- verify.
  X <- cbind(1, lon, lat, lon2, lat2, lonlat, subdf$indicator)
  # Kriging
  Y <- subdf$PM25
  starting <- list("phi" = 1 / (0.05 * maxd),
                   "sigma.sq" = 0.5 * var(Y), "tau.sq" = 0.5 * var(Y))
  tuning <- list("phi" = 1, "sigma.sq" = 0.05 * var(Y),
                 "tau.sq" = 0.05 * var(Y))
  amcmc <- list("n.batch" = n.samples / 100, "batch.length" = 100,
                "accept.rate" = 0.43)
  priors <- list("beta.Norm" = list(rep(0, 7), 100 * diag(7)),
                 "phi.Unif" = c(1 / (2 * maxd), 1 / (0.01 * maxd)),
                 "sigma.sq.IG" = c(2, 1),
                 "tau.sq.IG" = c(2, 1))
  fit_spbayes <- spLM(Y ~ X - 1, coords = S,
                      starting = starting, tuning = tuning,
                      priors = priors, cov.model = "exponential",
                      amcmc = amcmc, n.samples = n.samples, verbose = FALSE)
  pred <- spPredict(fit_spbayes, pred.coords = s0, pred.covars = X0,
                    start = burn, thin = 10, verbose = FALSE)
  pred <- pred$p.y.predictive.samples
  Yhat <- apply(pred, 1, mean)  # posterior predictive mean (kept, unused here)
  Ysd <- apply(pred, 1, sd)     # posterior predictive standard deviation
  df <- data.frame(long = s0[, 1], lat = s0[, 2], Y = Ysd)
  # BUG FIX: originally this print ran before df was created, so the first
  # iteration showed dim(base::df) == NULL; report progress after creation.
  print(dim(df))
  # Samples near RTP area
  # Longitude range: -78.89, -78.52; Latitude range: 35.71, 35.92
  rtp_sample <- subset(df, long <= -78.52 & long >= -78.89 &
                         lat >= 35.71 & lat <= 35.92)
  rtp_std <- mean(rtp_sample$Y)
  one_var[i + 1] <- rtp_std
  # Plot standard deviation surface for this hour
  pred_var <- ggplot(df, aes(long, lat)) +
    geom_polygon(aes(x = long, y = lat)) +
    geom_raster(aes(fill = Y)) +
    scale_fill_gradientn(colours = viridis(10)) +
    xlab(current) + ylab("") +
    labs(title = "Posterior predictive standard deviation") +
    coord_fixed()
  plot(pred_var)
}
## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

## [1] 1264 3

# Overlay the last processed hour's EPA + PurpleAir station locations on a
# county-level map of North Carolina.
map <- map_data("county")
nc <- filter(map, region == "north carolina")
R <- ggplot(data = nc) +
  geom_polygon(mapping = aes(x = long, y = lat, group = group)) +
  geom_point(data = subdf, mapping = aes(x = Lon, y = Lat),
             colour = 'purple', size = 0.5) +
  coord_fixed()
print(R)

one_var
## [1] 1.9052051 1.5893965 1.5114837 1.6707842 1.4343314 1.6064139
## [7] 1.4748075 1.3460338 1.4606207 1.3099679 1.4841957 5.0005923
## [13] 0.9644108 1.3168099 3.0002070 1.9856533 1.4075152 1.1871802
## [19] 2.3033336 1.2084476 1.4426470 3.5227921 1.7692592 1.8061895
## [25] 1.5714654 1.6002801 1.6811134 1.5773866 1.7422996 1.7006562
## [31] 1.3775725 1.3680586 1.6216852 1.1399162 1.2727918 1.1216875
## [37] 1.2117824 1.4776425 1.5733690 1.0785898 0.8406213 1.0027079
## [43] 1.3518229 1.3199309 1.2229709 1.3069607 1.4428337 1.3916583
## [49] 1.5774606 3.4798832 2.7693829 3.3271570 2.6801845 3.0497978
## [55] 2.8179059 2.7081117 2.8899521 2.0161804 1.9808458 1.8333730
## [61] 2.0168364 1.7802773 1.6040024 1.4351711 0.9588706 1.8628268
## [67] 1.3064317 2.1206894 1.4795010 1.3306186 14.5551906 7.6889506
## [73] 1.9315503 3.2577544 3.5552235 4.2886842 4.4615923 5.3810691
## [79] 4.0807105 3.6968467 3.0630148 2.9682985 2.8998619 2.4043797
## [85] 2.4129995 2.6090009 2.3764793 2.6860665 2.2573893 1.7885452
## [91] 2.8479019 1.8238429 2.3087738 2.1931886 2.7035691 3.1249869
## [97] 2.8051889 16.4519602 25.0294728 19.9644258 16.6320573 16.7146707
## [103] 12.1522414 11.6501266 9.1198348 6.5903549 4.5559061 3.6969860
## [109] 2.9997822 2.5020343 2.3892037 2.6375189 2.1300700 1.8588336
## [115] 2.6904052 2.5550291 1.7681094 1.8333074 4.3819418 2.4056640
## [121] 1.9947108 3.8649705 2.5167407 1.9478483 2.0165614 1.8805844
## [127] 1.8095385 2.0423607 1.8088296 2.3581922 1.7220284 2.4644528
## [133] 1.7973765 1.7919945 2.2592240 1.1511978 1.9754137 1.7684362
## [139] 4.2892584 1.1722152 1.5803833 1.2530491 2.0851733 2.1094615
## [145] 1.5477366 1.8205729 2.1500770 1.5220204 2.1010737 1.5288082
## [151] 1.4488927 1.6211477 1.7482352 1.6639982 2.1853334 2.2754807
## [157] 2.0504857 1.7761231 1.6097263 1.6406558 1.1991648 1.6740114
## [163] 1.3819978 3.3999477 1.1871686 1.2596185 0.7915277 0.8660867
# One trace plot of the RTP-area predictive SD per day (24 hourly values each).
# Generalized: derive the number of full days from the data instead of
# hard-coding 7, so the same code also works for the one-day 2018 window.
n_days <- length(one_var) %/% 24
for (i in seq_len(n_days)) {
  plot(one_var[(24 * (i - 1) + 1):(24 * i)], type = 'b')
}

# All hours in a single trace
plot(one_var, type = 'b')
